GEO数据挖掘-第三期-口腔鳞状细胞癌（OSCC）

Original 生信技能树生信菜鸟团 2022-06-06

GEO数据挖掘系列文-第三期-口腔鳞状细胞癌

文章标题

A pplication of weighted gene co-expression network analysis to identify key modules and hub genes in oral squamous cell carcinoma tumorigenesis

关键词

口腔鳞状细胞癌

第一步安装包

## 整理流程中的软件包
bioPackages <- 
c( 
  "stringi",  # 处理字符串
  "GEOquery", # 下载包
  "limma",    # 差异分析
  "ggfortify", "ggplot2", "pheatmap", "ggstatsplot", "VennDiagram", # 作图
  "clusterProfiler", "org.Hs.eg.db",                                # 注释
  "AnnotationDbi", "impute", "GO.db", "preprocessCore", "WGCNA"     # WGCNA
)


## 设置软件包的下载镜像
local({
  # CRAN的镜像设置
  r <- getOption( "repos" ); 
  r[ "CRAN" ] <- "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"; 
  options( repos = r )
  # bioconductor的镜像设置
  BioC <- getOption( "BioC_mirror" ); 
  BioC[ "BioC_mirror" ] <- "https://mirrors.ustc.edu.cn/bioc/"; 
  options( BioC_mirror = BioC )
})


## 安装未安装的软件包
#source( "https://bioconductor.org/biocLite.R" ) #第一次使用bioconductor需要运行
lapply( bioPackages, 
  function( bioPackage ){
    if( !require( bioPackage, character.only = T ) ){
      CRANpackages <- available.packages()
      if( bioPackage %in% rownames( CRANpackages) ){
        install.packages( bioPackage )
      }else{
        BiocInstaller::biocLite( bioPackage, suppressUpdates = FALSE, ask = FALSE)
      }
    }
  }
)

第二步整理数据集和临床表型

## 数据集和注释平台的下载步骤略过，详见第一期
options( stringsAsFactors = F )

load( './gset.Rdata' )
library( "GEOquery" )
## 取表达矩阵和样本信息表
{
  gset = gset[[1]]
  exprSet = exprs( gset )
  pdata = pData( gset )
  chl = length( colnames( pdata ) )
  group_list = as.character( pdata[, chl] )
}
colnames(pdata)
tmp = pdata[,10:12]
library(stringr)
status= str_split(as.character(tmp$characteristics_ch1),':',simplify = T)[,2]
age = str_split(as.character(tmp$characteristics_ch1.1),':',simplify = T)[,2]
sex = str_split(as.character(tmp$characteristics_ch1.2),':',simplify = T)[,2]
tmp$status = status
tmp$age = age
tmp$sex = sex
hclust_table = tmp[,4:6]
group_list = tmp$status

load('./ID2gene.Rdata')
## 去除没有注释的数据集
{
  exprSet = exprSet[ rownames(exprSet) %in% ID2gene[ , 1 ], ]
  ID2gene = ID2gene[ match(rownames(exprSet), ID2gene[ , 1 ] ), ]
}

dim( exprSet )
dim( ID2gene )
tail( sort( table( ID2gene[ , 2 ] ) ), n = 12L )

## 相同基因的表达数据取最大值
{
  MAX = by( exprSet, ID2gene[ , 2 ], 
              function(x) rownames(x)[ which.max( rowMeans(x) ) ] )
  MAX = as.character(MAX)
  exprSet = exprSet[ rownames(exprSet) %in% MAX , ]
  rownames( exprSet ) = ID2gene[ match( rownames( exprSet ), ID2gene[ , 1 ] ), 2 ]
}
exprSet = log(exprSet)
dim(exprSet)
exprSet[1:5,1:5]

save( exprSet, group_list, file = 'exprSet_for_WGCNA.Rdata')

第三步绘制进化树

library("WGCNA")
## WGCNA
load('./exprSet_for_WGCNA.Rdata')
load('./final_exprSet.Rdata')
colnames( exprSet ) = paste( group_list, 1:ncol( exprSet ), sep = '_' )
dim( exprSet )
exprSet[1:5,1:5]

datExpr = t(exprSet[order(apply(exprSet,1,mad), decreasing = T)[1:5000],])
dim( datExpr )
datExpr[1:4,1:4]

{
  nGenes = ncol(datExpr)
  nSamples = nrow(datExpr)
  #系统聚类树
  datExpr_tree<-hclust(dist(datExpr), method = "average")
  par(mar = c(0,5,2,0))
  plot(datExpr_tree, main = "Sample clustering", sub="", xlab="",
       cex.axis = 0.9, cex.main = 1.2, cex.lab=1, cex = 0.7)

  status_colors <- numbers2colors(as.numeric(factor(hclust_table$status)), 
                                  colors = c("red","white","pink"),signed = FALSE)
  age_colors <- numbers2colors(as.numeric(factor(hclust_table$age)), 
                               colors = c("red","white","pink"),signed = FALSE)
  sex_colors <- numbers2colors(as.numeric(factor(hclust_table$sex)), 
                               colors = c("red","white"),signed = FALSE)
  par(mar = c(1,4,3,1),cex=0.8)
  plotDendroAndColors(datExpr_tree, cbind(status_colors, age_colors, sex_colors),
                      groupLabels = colnames(hclust_table),
                      cex.dendroLabels = 0.8,
                      marAll = c(1, 4, 3, 1),
                      cex.rowText = 0.01,
                      main = "Sample dendrogram and trait heatmap")
}

第四步确定软阀值

powers = c(c(1:10), seq(from = 12, to=20, by=2))
## [1]  1  2  3  4  5  6  7  8  9 10 12 14 16 18 20
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)

# Plot the results:
# Scale independence
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices$Power,
     -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,
     xlab="Soft Threshold (power)",
     ylab="Scale Free Topology Model Fit,signed R^2",
     type="n", 
     main = paste("Scale independence"))
text(sft$fitIndices$Power,
     -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,
     labels=powers,
     cex=cex1,
     col="red")
abline(h=0.90,col="RED")

# Mean connectivity
plot(sft$fitIndices$Power,
     sft$fitIndices$mean.k.,
     xlab="Soft Threshold (power)",
     ylab="Mean Connectivity",
     type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices$Power,
     sft$fitIndices$mean.k.,
     labels=powers,
     cex=cex1,
     col="red")

best_beta=sft$powerEstimate
best_beta

第五步构建共表达矩阵

net = blockwiseModules(datExpr, power = 8,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs = TRUE,
                       saveTOMFileBase = "femaleMouseTOM",
                       verbose = 3)
## 这个诡异的问题必须记录一下，第一次运行blockwiseModules很顺利，后来就开始报错了，网上给的解决方案是重启电脑，实在不行
## 把WGCNA包重装一次，详情看这里https://www.biostars.org/p/305714/

table(net$colors)

moduleColors <- labels2colors(net$colors)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
# 计算根据模块特征向量基因计算模块相异度：
MEDiss = 1-cor(MEs0);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result

plot(METree, 
     main = "Clustering of module eigengenes",
     xlab = "", 
     sub = "")
# 在聚类图中画出剪切线
MEDissThres = 0.25
abline(h=MEDissThres, col = "red")

merge_modules = mergeCloseModules(datExpr, moduleColors, cutHeight = MEDissThres, verbose = 3)
mergedColors = merge_modules$colors;
mergedMEs = merge_modules$newMEs;
plotDendroAndColors(net$dendrograms[[1]], cbind(moduleColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

第六步 TOM图

nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]]
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6)
nSelect = 400
set.seed(10)
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
sizeGrWindow(9,9)
plotDiss = selectTOM^7
diag(plotDiss) = NA
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

第七步模块与形状的关系

design=model.matrix(~0+ datTraits$subtype)
colnames(design)=levels(datTraits$subtype)
moduleColors <- labels2colors(net$colors)
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, design , use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

sizeGrWindow(10,6)
textMatrix = paste(signif(moduleTraitCor, 2), "
(",
                   signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(hclust_table[,1:3]),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
MET = orderMEs(MEs)
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, 
                      xLabelsAngle= 90)
# Plot the dendrogram
sizeGrWindow(6,6);
par(cex = 1.0)
## 模块的聚类图
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
                      plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
## 性状与模块热图
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
                      plotDendrograms = FALSE, xLabelsAngle = 90)

第八步感兴趣模块的具体基因分析

# names (colors) of the modules
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
## 算出每个模块跟基因的皮尔森相关系数矩阵
## MEs是每个模块在每个样本里面的值
## datExpr是每个基因在每个样本的表达量
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
geneTraitSignificance = as.data.frame(cor(datExpr,use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
module = "turquoise"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for STATUS",
                   main = paste("Module membership vs. gene significance
"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
module = "brown"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for STATUS",
                   main = paste("Module membership vs. gene significance
"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

我们提供GEO线上视频课程，以及课后答疑服务，如有意向，请扫描下面的二维码于我联系。（仅限于购买课程用户，需提供购买截图方可入答疑群）

（长按图片扫描二维码可添加微信）

点击阅读原文直达我们的网易云课堂购买链接！

请务必思考再三哦，不要瞎买

高三女生醉酒后被强奸致死？检方回应

常德悲剧：让谴责无差别杀戮之声更加响亮一点

2024【公共营养师】培训报名通道已开启，不限学历，23岁及以上可报！还能领2000补贴

【惊】"以为要写遗书"! 飞温哥华航班遇炸弹气旋华人崩溃大哭连空姐都吐了; 客机颠簸盘旋3小时

女人最偏爱的十种男人

GEO数据挖掘-第三期-口腔鳞状细胞癌（OSCC）

您可能也对以下帖子感兴趣

高三女生醉酒后被强奸致死？检方回应

常德悲剧：让谴责无差别杀戮之声更加响亮一点

2024【公共营养师】培训报名通道已开启，不限学历，23岁及以上可报！还能领2000补贴

【惊】"以为要写遗书"! 飞温哥华航班遇炸弹气旋 华人崩溃大哭 连空姐都吐了; 客机颠簸盘旋3小时

女人最偏爱的十种男人

生成图片，分享到微信朋友圈

GEO数据挖掘-第三期-口腔鳞状细胞癌（OSCC）

您可能也对以下帖子感兴趣

【惊】"以为要写遗书"! 飞温哥华航班遇炸弹气旋华人崩溃大哭连空姐都吐了; 客机颠簸盘旋3小时